{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Helpdesk Hard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "import findspark\n",
    "import pandas as pd\n",
    "findspark.init()\n",
    "\n",
    "SVR = '192.168.31.31'\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import *\n",
    "from pyspark.sql import Window\n",
    "\n",
    "sc = (SparkSession.builder.appName('app12-3') \n",
    "      .master(f'spark://{SVR}:7077') \n",
    "      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') \n",
    "      .config('spark.cores.max', '4') \n",
    "      .config('spark.executor.instances', '1') \n",
    "      .config('spark.executor.cores', '2') \n",
    "      .config('spark.executor.memory', '10g') \n",
    "      .enableHiveSupport().getOrCreate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "shift = sc.read.table('sqlzoo.Shift')\n",
    "staff = sc.read.table('sqlzoo.Staff')\n",
    "issue = sc.read.table('sqlzoo.Issue')\n",
    "shift_type = sc.read.table('sqlzoo.Shift_type')\n",
    "level = sc.read.table('sqlzoo.Level')\n",
    "customer = sc.read.table('sqlzoo.Customer')\n",
    "caller = sc.read.table('sqlzoo.Caller')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11.\n",
    "Show the manager and number of calls received for each hour of the day on 2017-08-12\n",
    "\n",
    "```\n",
    "+---------+---------------+----+\n",
    "| Manager | Hr            | cc |\n",
    "+---------+---------------+----+\n",
    "| LB1     | 2017-08-12 08 |  6 |\n",
    "| LB1     | 2017-08-12 09 | 16 |\n",
    "| LB1     | 2017-08-12 10 | 11 |\n",
    "| LB1     | 2017-08-12 11 |  6 |\n",
    "| LB1     | 2017-08-12 12 |  8 |\n",
    "| LB1     | 2017-08-12 13 |  4 |\n",
    "| AE1     | 2017-08-12 14 | 12 |\n",
    "| AE1     | 2017-08-12 15 |  8 |\n",
    "| AE1     | 2017-08-12 16 |  8 |\n",
    "| AE1     | 2017-08-12 17 |  7 |\n",
    "| AE1     | 2017-08-12 19 |  5 |\n",
    "+---------+---------------+----+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Manager</th>\n",
       "      <th>hr</th>\n",
       "      <th>cc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LB1</td>\n",
       "      <td>2017-08-12 08</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LB1</td>\n",
       "      <td>2017-08-12 09</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LB1</td>\n",
       "      <td>2017-08-12 10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LB1</td>\n",
       "      <td>2017-08-12 11</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LB1</td>\n",
       "      <td>2017-08-12 12</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>LB1</td>\n",
       "      <td>2017-08-12 13</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>AE1</td>\n",
       "      <td>2017-08-12 14</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>AE1</td>\n",
       "      <td>2017-08-12 15</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>AE1</td>\n",
       "      <td>2017-08-12 16</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>AE1</td>\n",
       "      <td>2017-08-12 17</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>AE1</td>\n",
       "      <td>2017-08-12 19</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Manager             hr  cc\n",
       "0      LB1  2017-08-12 08   6\n",
       "1      LB1  2017-08-12 09  16\n",
       "2      LB1  2017-08-12 10  11\n",
       "3      LB1  2017-08-12 11   6\n",
       "4      LB1  2017-08-12 12   8\n",
       "5      LB1  2017-08-12 13   4\n",
       "6      AE1  2017-08-12 14  12\n",
       "7      AE1  2017-08-12 15   8\n",
       "8      AE1  2017-08-12 16   8\n",
       "9      AE1  2017-08-12 17   7\n",
       "10     AE1  2017-08-12 19   5"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(issue.withColumn('date_call', to_date(issue['Call_date']))\n",
    " .withColumn('hr', date_format(issue['Call_date'], 'yyyy-MM-dd HH'))\n",
    " .filter(col('date_call')=='2017-08-12')\n",
    " .join(shift, on=(col('Taken_by')==col('Operator')) & \n",
    "       (col('date_call')==col('Shift_date')))\n",
    " .groupBy('Manager', 'hr')\n",
    " .agg(count('Call_ref').alias('cc'))\n",
    " .orderBy('hr')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12.\n",
    "**80/20 rule. It is said that 80% of the calls are generated by 20% of the callers. Is this true? What percentage of calls are generated by the most active 20% of callers.**\n",
    "\n",
    "Note - Andrew has not managed to do this in one query - but he believes it is possible.\n",
    "\n",
    "```\n",
    "+---------+\n",
    "| t20pc   |\n",
    "+---------+\n",
    "| 32.2581 |\n",
    "+---------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>t20pc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>32.258065</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       t20pc\n",
       "0  32.258065"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(issue.groupBy('Caller_id')\n",
    " .agg(count('Caller_id').alias('n'))\n",
    " .orderBy(col('n').desc())\n",
    " .limit(int(issue[['Caller_id']].distinct().count() * 0.2))\n",
    " .agg(sum('n').alias('sum_n'))\n",
    " .withColumn('t20pc', 100 * col('sum_n') / issue.count())\n",
    " .select('t20pc')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13.\n",
    "**Annoying customers. Customers who call in the last five minutes of a shift are annoying. Find the most active customer who has never been annoying.**\n",
    "\n",
    "```\n",
    "+--------------+------+\n",
    "| Company_name | abna |\n",
    "+--------------+------+\n",
    "| High and Co. |   20 |\n",
    "+--------------+------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Company_ref</th>\n",
       "      <th>Company_name</th>\n",
       "      <th>abna</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>146</td>\n",
       "      <td>High and Co.</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Company_ref  Company_name  abna\n",
       "0          146  High and Co.    20"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annoy = (issue.withColumn('date_call', to_date(issue['Call_date']))\n",
    "         .join(shift, on=(col('Taken_by')==col('Operator')) & \n",
    "                         (col('date_call')==col('Shift_date')))\n",
    "         .join(shift_type, 'Shift_type')\n",
    "         .join(caller, 'Caller_id')\n",
    "         .join(customer, 'Company_ref')\n",
    "         .select('Company_ref', 'Shift_date', 'End_time', 'Call_date')\n",
    "         .withColumn('Shift_end', concat_ws(\n",
    "             ' ', col('Shift_date').cast('string'), col('End_time')))\n",
    "         .withColumn('till_shiftend', \n",
    "                     (to_timestamp(col('Shift_end')) - to_timestamp(col('Call_date')))\n",
    "                     .cast('long'))\n",
    "         .filter(col('till_shiftend') <= 5*60))\n",
    "\n",
    "(issue.join(caller, 'Caller_id')\n",
    " .join(customer, 'Company_ref')\n",
    " .join(annoy, 'Company_ref', how='left_anti')\n",
    " .groupBy('Company_ref', 'Company_name')\n",
    " .agg(count('Caller_id').alias('abna'))\n",
    " .orderBy(col('abna').desc())\n",
    " .limit(1)\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 14.\n",
    "**Maximal usage. If every caller registered with a customer makes a call in one day then that customer has \"maximal usage\" of the service. List the maximal customers for 2017-08-13.**\n",
    "\n",
    "```\n",
    "+-------------------+--------------+-------------+\n",
    "| company_name      | caller_count | issue_count |\n",
    "+-------------------+--------------+-------------+\n",
    "| Askew Inc.        |            2 |           2 |\n",
    "| Bai Services      |            2 |           2 |\n",
    "| Dasher Services   |            3 |           3 |\n",
    "| High and Co.      |            5 |           5 |\n",
    "| Lady Retail       |            4 |           4 |\n",
    "| Packman Shipping  |            3 |           3 |\n",
    "| Pitiable Shipping |            2 |           2 |\n",
    "| Whale Shipping    |            2 |           2 |\n",
    "+-------------------+--------------+-------------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Company_name</th>\n",
       "      <th>caller_count</th>\n",
       "      <th>issue_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Askew Inc.</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bai Services</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Dasher Services</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>High and Co.</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Lady Retail</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Packman Shipping</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Pitiable Shipping</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Whale Shipping</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Company_name  caller_count  issue_count\n",
       "0         Askew Inc.             2            2\n",
       "1       Bai Services             2            2\n",
       "2    Dasher Services             3            3\n",
       "3       High and Co.             5            5\n",
       "4        Lady Retail             4            4\n",
       "5   Packman Shipping             3            3\n",
       "6  Pitiable Shipping             2            2\n",
       "7     Whale Shipping             2            2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iss = (issue.filter((col('Call_date')>='2017-08-13') &\n",
    "                    (col('Call_date')<'2017-08-14'))\n",
    "       .join(caller, 'Caller_id', how='right')\n",
    "       .join(customer, 'Company_ref', how='left')\n",
    "       .groupBy('Caller_id', 'Company_ref', 'Company_name')\n",
    "       .agg(count('Call_ref').alias('n'))\n",
    "       .withColumn('iss', (col('n')>0).cast('integer')))\n",
    "\n",
    "(iss.groupBy('Company_name')\n",
    " .agg(count('Company_ref').alias('caller_count'),\n",
    "      sum('iss').alias('issue_count'))\n",
    " .filter(col('caller_count')==col('issue_count'))\n",
    " .orderBy('Company_name')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 15.\n",
    "**Consecutive calls occur when an operator deals with two callers within 10 minutes. Find the longest sequence of consecutive calls – give the name of the operator and the first and last call date in the sequence.**\n",
    "\n",
    "```\n",
    "+----------+---------------------+---------------------+-------+\n",
    "| taken_by | first_call          | last_call           | calls |\n",
    "+----------+---------------------+---------------------+-------+\n",
    "| AB1      | 2017-08-14 09:06:00 | 2017-08-14 10:17:00 |    24 |\n",
    "+----------+---------------------+---------------------+-------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Taken_by</th>\n",
       "      <th>first_call</th>\n",
       "      <th>last_call</th>\n",
       "      <th>n_calls</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AB1</td>\n",
       "      <td>2017-08-14 09:06:00.0</td>\n",
       "      <td>2017-08-14 10:17:00.0</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Taken_by             first_call              last_call  n_calls\n",
       "0      AB1  2017-08-14 09:06:00.0  2017-08-14 10:17:00.0       24"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(issue.withColumn('lag', lag('Call_date', 1).over(\n",
    "    Window.partitionBy('Taken_by').orderBy(['Taken_by', 'Call_date'])))\n",
    " .withColumn('flag', ((to_timestamp(col('Call_date')) - \n",
    "                       to_timestamp(col('lag'))).cast('long') > 10*60).cast('integer'))\n",
    " .withColumn('grp', sum('flag').over(\n",
    "     Window.partitionBy('Taken_by').orderBy(['Taken_by', 'Call_date'])))\n",
    " .groupBy('Taken_by', 'grp')\n",
    " .agg(min('Call_date').alias('first_call'),\n",
    "      max('Call_date').alias('last_call'),\n",
    "      count('Caller_id').alias('n_calls'))\n",
    " .orderBy(col('n_calls').desc())\n",
    " .select('Taken_by', 'first_call', 'last_call', 'n_calls')\n",
    " .limit(1)\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
